We load the data from the Competence Centre for Bibliometrics: http://www.bibliometrie.info/. It licenses access to the Web of Science and Scopus bibliometric databases, which together cover a high proportion of all peer-reviewed research literature. The Competence Centre for Bibliometrics further processes the raw data of both databases so that they can be queried with SQL.
In [1]:
import cx_Oracle #ensure that the OS, the Oracle InstantClient (Basic, ODBC, SDK) and cx_Oracle are all 64 bit. Install with "pip install cx_Oracle" and add the InstantClient directory to the PATH variable!
import pandas as pd
import re
import plotly.plotly as py
import plotly.graph_objs as go
In [2]:
#parameters:
searchterm="big data" #lowercase!
colorlist=["#01be70","#586bd0","#c0aa12","#0183e6","#f69234","#0095e9","#bd8600","#007bbe","#bb7300","#63bcfc","#a84a00","#01bedb","#82170e","#00c586","#a22f1f","#3fbe57","#3e4681","#9bc246","#9a9eec","#778f00","#00aad9","#fc9e5e","#01aec1","#832c1e","#55c99a","#dd715b","#017c1c","#ff9b74","#009556","#83392a","#00b39b","#8e5500","#50a7c6","#f4a268","#02aca7","#532b00","#67c4bd","#5e5500","#f0a18f","#007229","#d2b073","#005d3f","#a5be6b","#2a4100","#8cb88c","#2f5c00","#007463","#5b7200","#787c48","#3b7600"]
In [ ]:
dsn_tns=cx_Oracle.makedsn('127.0.0.1','6025',service_name='bibliodb01.fiz.karlsruhe') #due to licence requirements,
# access is only allowed for members of the Competence Centre for Bibliometrics and its cooperation partners.
# You can still continue with the resulting CSV files below.
#open connection:
db=cx_Oracle.connect("<username>", "<password>", dsn_tns) #fill in your own credentials
print(db.version)
In [ ]:
#%% define SQL query function: runs a query and returns the result as a DataFrame
def read_query(connection, query):
    cursor = connection.cursor()
    try:
        cursor.execute(query)
        names = [x[0] for x in cursor.description]  # column names from the cursor metadata
        rows = cursor.fetchall()
        return pd.DataFrame(rows, columns=names)
    finally:
        if cursor is not None:
            cursor.close()
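As a quick smoke test, the helper can first be run against Oracle's built-in DUAL dummy table before firing the heavier bibliometric queries; this check is not part of the original workflow, just a sanity check that the connection and the helper behave as expected:
In [ ]:
#%% optional sanity check: a trivial query against Oracle's DUAL dummy table
df_test = read_query(db, "SELECT 1 AS one, 2 AS two FROM dual")
print(df_test)  # expect a single row with the (uppercased) columns ONE and TWO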
In [ ]:
#%% load paper titles from WOSdb:
database="wos_B_2016"
command="""SELECT DISTINCT(ARTICLE_TITLE), PUBYEAR
FROM """+database+""".KEYWORDS, """+database+""".ITEMS_KEYWORDS, """+database+""".ITEMS
WHERE
"""+database+""".ITEMS_KEYWORDS.FK_KEYWORDS="""+database+""".KEYWORDS.PK_KEYWORDS
AND """+database+""".ITEMS.PK_ITEMS="""+database+""".ITEMS_KEYWORDS.FK_ITEMS
AND (lower("""+database+""".KEYWORDS.KEYWORD) LIKE '%"""+searchterm+"""%' OR lower(ARTICLE_TITLE) LIKE '%"""+searchterm+"""%')
"""
dfWOS=read_query(db,command)
dfWOS['wos']=True #to make the source identifiable
dfWOS.to_csv("all_big_data_titles_year_wos.csv", sep=';')
#%% load paper titles from SCOPUSdb:
database="SCOPUS_B_2016"
command="""SELECT DISTINCT(ARTICLE_TITLE), PUBYEAR
FROM """+database+""".KEYWORDS, """+database+""".ITEMS_KEYWORDS, """+database+""".ITEMS
WHERE
"""+database+""".ITEMS_KEYWORDS.FK_KEYWORDS="""+database+""".KEYWORDS.PK_KEYWORDS
AND """+database+""".ITEMS.PK_ITEMS="""+database+""".ITEMS_KEYWORDS.FK_ITEMS
AND (lower("""+database+""".KEYWORDS.KEYWORD) LIKE '%"""+searchterm+"""%' OR lower(ARTICLE_TITLE) LIKE '%"""+searchterm+"""%')
"""
dfSCOPUS=read_query(db,command)
dfSCOPUS['scopus']=True #to make the source identifiable
dfSCOPUS.to_csv("all_big_data_titles_year_scopus.csv", sep=';')
#this takes some time; from here on we work with the exported CSV files
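A side note on the query construction: Oracle cannot bind schema or table names, so the database prefix has to stay concatenated into the SQL string, but the search term itself could be passed as a bind variable instead of being spliced in. A minimal sketch of that variant, under the same schema assumptions as above:
In [ ]:
#%% sketch: same keyword/title filter, but with the search term as a named bind variable
database="wos_B_2016" # or "SCOPUS_B_2016"
command_bound="""SELECT DISTINCT(ARTICLE_TITLE), PUBYEAR
FROM """+database+""".KEYWORDS, """+database+""".ITEMS_KEYWORDS, """+database+""".ITEMS
WHERE
"""+database+""".ITEMS_KEYWORDS.FK_KEYWORDS="""+database+""".KEYWORDS.PK_KEYWORDS
AND """+database+""".ITEMS.PK_ITEMS="""+database+""".ITEMS_KEYWORDS.FK_ITEMS
AND (lower("""+database+""".KEYWORDS.KEYWORD) LIKE :term OR lower(ARTICLE_TITLE) LIKE :term)
"""
cursor = db.cursor()
try:
    cursor.execute(command_bound, term='%'+searchterm+'%')  # the same bind name serves both LIKE clauses
    rows = cursor.fetchall()
finally:
    cursor.close()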
In [14]:
dfWOS=pd.read_csv("all_big_data_titles_year_wos.csv",sep=";")
dfSCOPUS=pd.read_csv("all_big_data_titles_year_scopus.csv",sep=";")
df=pd.merge(dfWOS,dfSCOPUS,on='ARTICLE_TITLE',how='outer')
#get PUBYEAR into one column: rows that appear in WoS get the WoS publication year,
#Scopus-only rows keep the Scopus year already stored in PUBYEAR_y
df.loc[df['wos'] == 1, 'PUBYEAR_y'] = df['PUBYEAR_x']
#save resulting csv again:
df=df[['ARTICLE_TITLE','PUBYEAR_y','wos','scopus']]
df.to_csv("all_big_data_titles_with_year.csv", sep=';')
df
Out[14]:
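To make the merge logic explicit: the outer join keeps every distinct title, and the wos/scopus flags stay NaN wherever a title is missing from that source, which is exactly what the count aggregation below exploits. A toy example with made-up titles (not the real data) behaves like this:
In [ ]:
#%% toy example (made-up titles) to illustrate the outer merge and the NaN flags
toy_wos = pd.DataFrame({'ARTICLE_TITLE': ['A', 'B'], 'PUBYEAR': [2014, 2015], 'wos': True})
toy_scopus = pd.DataFrame({'ARTICLE_TITLE': ['B', 'C'], 'PUBYEAR': [2015, 2016], 'scopus': True})
toy = pd.merge(toy_wos, toy_scopus, on='ARTICLE_TITLE', how='outer')
toy.loc[toy['wos'] == 1, 'PUBYEAR_y'] = toy['PUBYEAR_x']  # same year fix as above
print(toy[['ARTICLE_TITLE', 'PUBYEAR_y', 'wos', 'scopus']])
# 'A' has scopus=NaN and 'C' has wos=NaN, so counting non-null cells per column
# later yields papers per source, while ARTICLE_TITLE counts all papers.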
In [17]:
#count non-null cells per publication year: 'wos' and 'scopus' are only filled for papers
#from that source, so 'count' gives papers per source and year; ARTICLE_TITLE counts all papers
grouped=df.groupby(['PUBYEAR_y'])
df2=grouped.agg('count').reset_index()
df2
Out[17]:
In [46]:
#set data for horizontal bar plot (overall totals per source):
data = [go.Bar(
    x=[df2['wos'].sum(), df2['scopus'].sum(), df2['ARTICLE_TITLE'].sum()],
    y=['Web of Science', 'Scopus', 'Total'],
    orientation='h',
    marker=dict(
        color=colorlist
    )
)]
#py.plot(data, filename='big_data_papers_horizontal') #for uploading to plotly
py.iplot(data, filename='horizontal-bar')
Out[46]:
In [47]:
#set data for the grouped bar plot (one trace per source plus the total):
trace1 = go.Bar(
    x=df2['PUBYEAR_y'],
    y=df2['wos'],
    name='Web of Science',
    marker=dict(
        color=colorlist[0]
    )
)
trace2 = go.Bar(
    x=df2['PUBYEAR_y'],
    y=df2['scopus'],
    name='Scopus',
    marker=dict(
        color=colorlist[1]
    )
)
trace3 = go.Bar(
    x=df2['PUBYEAR_y'],
    y=df2['ARTICLE_TITLE'],
    name='All Papers',
    marker=dict(
        color=colorlist[2]
    )
)
data = [trace1, trace2, trace3]
In [54]:
#set layout for the grouped bar chart with a linear y-scale:
layout_no_log = go.Layout(
    title='Big data papers over time',
    barmode='group',
    xaxis=dict(
        title='year',
        titlefont=dict(
            family='Arial, sans-serif',
            size=14,
            color='lightgrey'
        ),
        tickfont=dict(
            family='Arial, sans-serif',
            size=10,
            color='black'
        ),
        showticklabels=True,
        dtick=1,
        tickangle=45,
    )
)
#plot:
fig1 = go.Figure(data=data, layout=layout_no_log)
py.iplot(fig1, filename='big_data_papers_no_log')
In [44]:
#set layout for the grouped bar chart with a logarithmic y-scale:
layout_log = go.Layout(
    title='Big data papers over time (log y-scale)',
    barmode='group',
    xaxis=dict(
        title='year',
        titlefont=dict(
            family='Arial, sans-serif',
            size=14,
            color='lightgrey'
        ),
        tickfont=dict(
            family='Arial, sans-serif',
            size=10,
            color='black'
        ),
        showticklabels=True,
        dtick=1,
        tickangle=45,
    ),
    yaxis=dict(
        type='log'
    )
)
fig2 = go.Figure(data=data, layout=layout_log)
py.iplot(fig2, filename='big_data_papers_log')
Out[44]:
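The py.iplot calls above go through the Plotly cloud service and therefore require Plotly account credentials. If you only want to reproduce the charts locally, Plotly's offline mode accepts the same figure objects; a minimal sketch:
In [ ]:
#%% optional: render the same figures locally without a Plotly account
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected=True)  # inject plotly.js into the notebook
iplot(fig1)  # grouped bars, linear y-scale
iplot(fig2)  # grouped bars, logarithmic y-scale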